[2] 


[3] : 


[4] : 


[4] : 


[5] : 


[5]; 


customer-shopping-retail-sales-analysis 


import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 


import seaborn as sns 


# Load the Kaggle customer-shopping dataset into a DataFrame.
# NOTE(review): the file name was truncated in this export; "customer_shopping_data.csv"
# is the reconstructed name — confirm against the dataset directory.
df = pd.read_csv("/kaggle/input/customer-shopping-dataset/customer_shopping_data.csv")


# Preview the first rows of the loaded frame.
df.head()


invoice_no customer_id 


f G KM Pe OO 


1138884 
1317333 
1127801 
I173702 
1337046 


payment method 


f G KM PO 


Credit Card 
Debit Card 
Cash 
Credit Card 
Cash 


# Show the last rows of the frame.
df.tail()


99452 
99453 
99454 
99455 
99456 


99452 
99453 
99454 
99455 


invoice_no 
1219422 
1325143 
1824010 
1702964 
1232867 


March 18, 2023 


gender age category quantity 
C241288 Female 28 Clothing 5 
C111565 Male 21 Shoes 3 
C266599 Male 20 Clothing 1 
C988172 Female 66 Shoes 5 
C189076 Female 53 Books 4 
invoice_date shopping mall 
5/8/2022 Kanyon 
12/12/2021 Forum Istanbul 
9/11/2021 Metrocity 
16/05/2021 Metropol AVM 
24/10/2021 Kanyon 
customer_id gender age category 
C441542 Female 45 Souvenir 
C569580 Male 27 Food & Beverage 
C103292 Male 63 Food & Beverage 
C800631 Male 56 Technology 
C273973 Female 36 Souvenir 


payment_method invoice_date 
Card 21/09/2022 
Cash 22/09/2021 
Card 28/03/2021 
Cash 16/03/2021 


Credit 


Debit 


shopping mall 
Kanyon 

Forum Istanbul 
Metrocity 
Istinye Park 


price 
1500.40 
1800.51 
300.08 
3000.85 
60.60 


quantity 
5 


2 
2 
4 
3 


price 
58.65 
10.46 
10.46 
4200.00 
35.19 


X 


99456 Credit Card 15/10/2022 Mall of Istanbul 
[6]: df.shape  # (number of rows, number of columns)
[6]: (99457, 10) 
[7]: | df.size 
[7]: 994570 
[8]: df.isnullO.sum() 


[8]: invoice_no 0
customer_id 0
gender 0
age 0
category 0
quantity 0
price 0
payment_method 0
invoice_date 0
shopping_mall 0
dtype: int64


[9]: df.duplicated().value counts() 


[9]: False 99457 
dtype: int64 


[10]: df.columns  # list the column labels


[10]: Index(['invoice_no', 'customer_id', 'gender', 'age', 'category', 'quantity',
'price', 'payment_method', 'invoice_date', 'shopping_mall'],
dtype='object')


[11]: df.info()  # per-column non-null counts and dtypes


<class 'pandas.core.frame.DataFrame'> 
RangeIndex: 99457 entries, 0 to 99456 
Data columns (total 10 columns): 


# Column Non-Null Count Dtype 
0 invoice_no 99457 non-null object 
1 customer_id 99457 non-null object 
2 gender 99457 non-null object 
3 age 99457 non-null int64 
4 category 99457 non-null object 
5 quantity 99457 non-null int64 
6 price 99457 non-null float64 


7 payment_method 99457 non-null object
8 invoice_date 99457 non-null object
9 shopping_mall 99457 non-null object
dtypes: float64(1), int64(2), object(7)
memory usage: 7.6+ MB


[12]: df['invoice date']-df['invoice date'].apply(pd.to datetime) 
[13] : df.info()  # re-inspect column dtypes


<class 'pandas.core.frame.DataFrame'> 
RangeIndex: 99457 entries, 0 to 99456 
Data columns (total 10 columns): 


# Column Non-Null Count Dtype 

0 invoice_no 99457 non-null object 

1 customer_id 99457 non-null object 

2 gender 99457 non-null object 

3 age 99457 non-null int64 

4 category 99457 non-null object 

5 quantity 99457 non-null int64 

6 price 99457 non-null float64 

7 payment_method 99457 non-null object

8 invoice_date 99457 non-null datetime64[ns]


9 shopping_mall 99457 non-null object
dtypes: datetime64[ns](1), float64(1), int64(2), object(6)
memory usage: 7.6+ MB 


[14]: df.nunique()  # number of distinct values in each column


[14]: invoice_no 99457 
customer_id 99457 
gender 2 
age 52 
category 8 
quantity 5 
price 40 
payment_method 3
invoice_date 797
shopping_mall 10


dtype: int64 


[15]: | # Calculate the count of each gender 
gender_count = df['gender'].value_counts()

# Create pie chart; autopct prints each wedge's share with one decimal place
fig, ax = plt.subplots()
ax.pie(gender_count, labels=gender_count.index, autopct='%1.1f%%')
ax.set_title('Customer Gender Distribution')

plt.show()


Customer Gender Distribution 


Female 


Male 


[31]: # Define age groups 
age_groups = [0, 18, 25, 35, 45, 55, 65, 100]
# One label per bin (len(age_groups) - 1 labels)
labels = ['0-17', '18-24', '25-34', '35-44', '45-54', '55-64', '65+']

# Create age groups based on the age column.
# right=False makes each bin closed on the left ([18, 25), [25, 35), ...), so a value
# sitting on a boundary (18, 25, 35, ...) falls in the bin whose label starts with it.
# With the default right-closed bins an 18-year-old would be counted under '0-17'.
df['Age Group'] = pd.cut(df['age'], bins=age_groups, labels=labels, right=False)

# Create a bar chart of the age-group counts, kept in bin order
fig, ax = plt.subplots(figsize=(8, 6))
df['Age Group'].value_counts().sort_index().plot(kind='bar', color='red', ax=ax)
ax.set_xlabel('Age Group')
ax.set_ylabel('Count')
ax.set_title('Customer Age Group Distribution')

# Add count labels to the bars
for i in ax.containers:
    ax.bar_label(i)

plt.show()


Customer Age Group Distribution 
20000 


19016 19016 


17500 


15000 


12500 


10000 


Count 


7500 


5000 


2500 


st 
7 
us 
m 


Age Group 


0-17 
18-24 
25-34 
45-54 
55-64 

65+ 


[41]: | # create a bar chart to visualize the count of each category 
category_count = df['category'].value_counts()
category_count.plot(kind='bar', color='purple')
plt.xlabel('Category')
plt.ylabel('Count')
plt.title('Count of Orders by Category')

# add labels to the bars: bar i sits at x == i, so place the text just above its height
for i, count in enumerate(category_count):
    plt.text(i, count + 10, str(count), ha='center', fontsize=9)

plt.show()


Count of Orders by Category 
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[64]: # create a horizontal bar chart to visualize the average quantity for each, 

ocategory 

# Mean *price* per category (not quantity), sorted from highest to lowest
category_mean = df.groupby('category')['price'].mean().sort_values(ascending=False)

plt.figure(figsize=(8, 4))
category_mean.plot(kind='barh', color='orange')
plt.xlabel('Price')
plt.ylabel('Category')
plt.title('Average Price by Category')

# add labels to the bars: bar i sits at y == i, text is placed at the bar's end
for i, mean in enumerate(category_mean):
    plt.text(mean - 0.1, i, str(round(mean, 2)), va='center', fontsize=10)


[32] : 


plt.show() 


Average Price by Category 
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# Calculate the count of each gender 
pay_count = df['payment_method'].value_counts()

# define one colour per payment method (wedges are coloured in value_counts order)
colors = ['skyblue', 'pink', 'lightgreen']

# Create pie chart
fig, ax = plt.subplots()


2000 2500 3000 


# autopct prints each wedge's share with one decimal place
ax.pie(pay_count, labels=pay_count.index, colors=colors, autopct='%1.1f%%')
ax.set_title('Payment Method Distribution')

plt.show()


Payment Method Distribution 
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[37]: # get the count of each shopping mall 
mall_count = df['shopping_mall'].value_counts()

# create a bar chart to visualize the count of each shopping mall
fig, ax = plt.subplots(figsize=(10, 6))
# draw on the axes created above rather than relying on the implicit current axes
mall_count.plot(kind='bar', color='green', ax=ax)
plt.xlabel('Shopping Mall')
plt.ylabel('Count')
plt.title('Count of Customers by Shopping Mall')

# add labels to the bars: bar i sits at x == i, so place the text just above its height
for i, count in enumerate(mall_count):
    plt.text(i, count + 10, str(count), ha='center')

plt.show()


Count of Customers by Shopping Mall 
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[48]: # create a scatter plot to visualize the relationship between price and quantity 
# x / y are column names looked up in the frame passed as `data`
plt.scatter(x='price', y='quantity', data=df)
plt.xlabel('Price')
plt.ylabel('Quantity')
plt.title('Price vs Quantity')
plt.show()
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